From a2966a3a227fcd392a50cbc5dc4835588327788d Mon Sep 17 00:00:00 2001
From: Seth Hollyman
Date: Fri, 14 Feb 2020 19:34:38 +0000
Subject: [PATCH 1/3] fix: bigquery/bqml testing

The BQML tests use a non-unique dataset ID for multiple examples, and
testing currently triggers many concurrent creations and deletions of that
dataset. Switch to a dataset ID that incorporates a UUID so that
invocations don't stomp on one another, which also necessitates
parameterizing much of the SQL.

There's also an issue with the pandas import currently, possibly due to
recent changes in pandas. This change also pins pandas to 0.22 and doesn't
rely on the dependency being expressed as an extra through
google-cloud-bigquery.
---
 bigquery/bqml/data_scientist_tutorial_test.py | 29 +++++++++-------
 bigquery/bqml/ncaa_tutorial_test.py           | 33 +++++++++++--------
 bigquery/bqml/requirements.txt                |  3 +-
 .../bqml/resources/training_data_query.sql    |  4 +--
 4 files changed, 40 insertions(+), 29 deletions(-)

diff --git a/bigquery/bqml/data_scientist_tutorial_test.py b/bigquery/bqml/data_scientist_tutorial_test.py
index 532835294d1..8d1acb372c5 100644
--- a/bigquery/bqml/data_scientist_tutorial_test.py
+++ b/bigquery/bqml/data_scientist_tutorial_test.py
@@ -16,9 +16,14 @@
 from google.cloud import bigquery
 # [END bqml_data_scientist_tutorial_import_and_client]
 import pytest
+import uuid
 
 # [START bqml_data_scientist_tutorial_import_and_client]
 client = bigquery.Client()
+# We use a unique dataset ID for this example to avoid collisions with 
+# other invocations of this tutorial. In practice, you could leverage
+# a persistent dataset and not create/destroy it with each invocation.
+dataset_id = "bqml_tutorial_{}".format(str(uuid.uuid4().hex))
 # [END bqml_data_scientist_tutorial_import_and_client]
@@ -26,19 +31,19 @@ def delete_dataset():
     yield
     client.delete_dataset(
-        client.dataset('bqml_tutorial'), delete_contents=True)
+        client.dataset(dataset_id), delete_contents=True)
 
 
 def test_data_scientist_tutorial(delete_dataset):
     # [START bqml_data_scientist_tutorial_create_dataset]
-    dataset = bigquery.Dataset(client.dataset('bqml_tutorial'))
+    dataset = bigquery.Dataset(client.dataset(dataset_id))
     dataset.location = 'US'
     client.create_dataset(dataset)
     # [END bqml_data_scientist_tutorial_create_dataset]
 
     # [START bqml_data_scientist_tutorial_create_model]
     sql = """
-        CREATE OR REPLACE MODEL `bqml_tutorial.sample_model`
+        CREATE OR REPLACE MODEL `{}.sample_model`
         OPTIONS(model_type='logistic_reg') AS
         SELECT
             IF(totals.transactions IS NULL, 0, 1) AS label,
@@ -50,7 +55,7 @@ def test_data_scientist_tutorial(delete_dataset):
             `bigquery-public-data.google_analytics_sample.ga_sessions_*`
         WHERE
             _TABLE_SUFFIX BETWEEN '20160801' AND '20170630'
-    """
+    """.format(dataset_id)
     df = client.query(sql).to_dataframe()
     print(df)
     # [END bqml_data_scientist_tutorial_create_model]
@@ -60,8 +65,8 @@ def test_data_scientist_tutorial(delete_dataset):
         SELECT
             *
         FROM
-            ML.TRAINING_INFO(MODEL `bqml_tutorial.sample_model`)
-    """
+            ML.TRAINING_INFO(MODEL `{}.sample_model`)
+    """.format(dataset_id)
     df = client.query(sql).to_dataframe()
     print(df)
     # [END bqml_data_scientist_tutorial_get_training_statistics]
@@ -70,7 +75,7 @@ def test_data_scientist_tutorial(delete_dataset):
     sql = """
         SELECT
             *
-        FROM ML.EVALUATE(MODEL `bqml_tutorial.sample_model`, (
+        FROM ML.EVALUATE(MODEL `{}.sample_model`, (
             SELECT
                 IF(totals.transactions IS NULL, 0, 1) AS label,
                 IFNULL(device.operatingSystem, "") AS os,
                 device.isMobile AS is_mobile,
                 IFNULL(geoNetwork.country, "") AS country,
                 IFNULL(totals.pageviews, 0) AS pageviews
             FROM
@@ -81,7 +86,7 @@ def test_data_scientist_tutorial(delete_dataset):
                 `bigquery-public-data.google_analytics_sample.ga_sessions_*`
             WHERE
                 _TABLE_SUFFIX BETWEEN '20170701' AND '20170801'))
-    """
+    """.format(dataset_id)
     df = client.query(sql).to_dataframe()
     print(df)
     # [END bqml_data_scientist_tutorial_evaluate_model]
@@ -91,7 +96,7 @@ def test_data_scientist_tutorial(delete_dataset):
         SELECT
             country,
             SUM(predicted_label) as total_predicted_purchases
-        FROM ML.PREDICT(MODEL `bqml_tutorial.sample_model`, (
+        FROM ML.PREDICT(MODEL `{}.sample_model`, (
             SELECT
                 IFNULL(device.operatingSystem, "") AS os,
                 device.isMobile AS is_mobile,
@@ -104,7 +109,7 @@ def test_data_scientist_tutorial(delete_dataset):
         GROUP BY country
         ORDER BY total_predicted_purchases DESC
         LIMIT 10
-    """
+    """.format(dataset_id)
     df = client.query(sql).to_dataframe()
     print(df)
     # [END bqml_data_scientist_tutorial_predict_transactions]
@@ -114,7 +119,7 @@ def test_data_scientist_tutorial(delete_dataset):
         SELECT
             fullVisitorId,
             SUM(predicted_label) as total_predicted_purchases
-        FROM ML.PREDICT(MODEL `bqml_tutorial.sample_model`, (
+        FROM ML.PREDICT(MODEL `{}.sample_model`, (
             SELECT
                 IFNULL(device.operatingSystem, "") AS os,
                 device.isMobile AS is_mobile,
@@ -128,7 +133,7 @@ def test_data_scientist_tutorial(delete_dataset):
         GROUP BY fullVisitorId
         ORDER BY total_predicted_purchases DESC
         LIMIT 10
-    """
+    """.format(dataset_id)
     df = client.query(sql).to_dataframe()
     print(df)
     # [END bqml_data_scientist_tutorial_predict_purchases]
diff --git a/bigquery/bqml/ncaa_tutorial_test.py b/bigquery/bqml/ncaa_tutorial_test.py
index 5fd96a3961f..d17fac3a9c7 100644
--- a/bigquery/bqml/ncaa_tutorial_test.py
+++ b/bigquery/bqml/ncaa_tutorial_test.py
@@ -14,6 +14,7 @@
 import io
 import os
+import uuid
 
 # [START bqml_ncaa_tutorial_import_and_client]
 from google.cloud import bigquery
@@ -22,6 +23,10 @@
 # [START bqml_ncaa_tutorial_import_and_client]
 client = bigquery.Client()
+# We use a unique dataset ID for this example to avoid collisions with 
+# other invocations of this tutorial. In practice, you could leverage
+# a persistent dataset and not create/destroy it with each invocation.
+dataset_id = "bqml_tutorial_{}".format(str(uuid.uuid4().hex))
 # [END bqml_ncaa_tutorial_import_and_client]
@@ -29,12 +34,12 @@ def delete_dataset():
     yield
     client.delete_dataset(
-        client.dataset('bqml_tutorial'), delete_contents=True)
+        client.dataset(dataset_id), delete_contents=True)
 
 
 def test_ncaa_tutorial(delete_dataset):
     # [START bqml_ncaa_tutorial_create_dataset]
-    dataset = bigquery.Dataset(client.dataset('bqml_tutorial'))
+    dataset = bigquery.Dataset(client.dataset(dataset_id))
     dataset.location = 'US'
     client.create_dataset(dataset)
     # [END bqml_ncaa_tutorial_create_dataset]
@@ -53,12 +58,12 @@ def test_ncaa_tutorial(delete_dataset):
         job_config.destination = table_ref
         query_filepath = os.path.join(
             resources_directory, query_filepath)
-        sql = io.open(query_filepath, 'r', encoding='utf-8').read()
+        sql = io.open(query_filepath, 'r', encoding='utf-8').read().format(dataset_id)
         client.query(sql, job_config=job_config).result()
 
     # [START bqml_ncaa_tutorial_create_model]
     sql = """
-        CREATE OR REPLACE MODEL `bqml_tutorial.ncaa_model`
+        CREATE OR REPLACE MODEL `{0}.ncaa_model`
         OPTIONS (
             model_type='linear_reg', max_iteration=50 ) AS
@@ -69,11 +74,11 @@ def test_ncaa_tutorial(delete_dataset):
             total_three_points_att), total_three_points_att as label
         FROM
-            `bqml_tutorial.wide_games`
+            `{0}.wide_games`
         WHERE
             # remove the game to predict
             game_id != 'f1063e80-23c7-486b-9a5e-faa52beb2d83'
-    """
+    """.format(dataset_id)
     df = client.query(sql).to_dataframe()
     print(df)
     # [END bqml_ncaa_tutorial_create_model]
@@ -83,8 +88,8 @@ def test_ncaa_tutorial(delete_dataset):
         SELECT
             *
         FROM
-            ML.TRAINING_INFO(MODEL `bqml_tutorial.ncaa_model`)
-    """
+            ML.TRAINING_INFO(MODEL `{}.ncaa_model`)
+    """.format(dataset_id)
     df = client.query(sql).to_dataframe()
     print(df)
     # [END bqml_ncaa_tutorial_get_training_statistics]
@@ -96,13 +101,13 @@ def test_ncaa_tutorial(delete_dataset):
             *,
             total_three_points_att AS label
         FROM
-            `bqml_tutorial.wide_games` )
+            `{0}.wide_games` )
         SELECT
             *
         FROM
-            ML.EVALUATE(MODEL `bqml_tutorial.ncaa_model`,
+            ML.EVALUATE(MODEL `{0}.ncaa_model`,
             TABLE eval_table)
-    """
+    """.format(dataset_id)
     df = client.query(sql).to_dataframe()
     print(df)
     # [END bqml_ncaa_tutorial_evaluate_model]
@@ -113,7 +118,7 @@ def test_ncaa_tutorial(delete_dataset):
         SELECT
             *
         FROM
-            `bqml_tutorial.wide_games`
+            `{0}.wide_games`
         WHERE
             game_id='f1063e80-23c7-486b-9a5e-faa52beb2d83' )
         SELECT
@@ -125,7 +130,7 @@ def test_ncaa_tutorial(delete_dataset):
             game_id,
             predicted_label AS predicted_total_three_points_att
         FROM
-            ML.PREDICT(MODEL `bqml_tutorial.ncaa_model`,
+            ML.PREDICT(MODEL `{0}.ncaa_model`,
             table game_to_predict) ) AS predict
         JOIN (
         SELECT
@@ -135,7 +140,7 @@ def test_ncaa_tutorial(delete_dataset):
             game_to_predict) AS truth
         ON
             predict.game_id = truth.game_id
-    """
+    """.format(dataset_id)
     df = client.query(sql).to_dataframe()
     print(df)
     # [END bqml_ncaa_tutorial_predict_outcomes]
diff --git a/bigquery/bqml/requirements.txt b/bigquery/bqml/requirements.txt
index 86141411cac..0fbfc4124cc 100644
--- a/bigquery/bqml/requirements.txt
+++ b/bigquery/bqml/requirements.txt
@@ -1,3 +1,4 @@
-google-cloud-bigquery[pandas]==1.20.0
+pandas==0.22
+google-cloud-bigquery>=1.24.0
 flaky==3.6.1
 mock==3.0.5
diff --git a/bigquery/bqml/resources/training_data_query.sql b/bigquery/bqml/resources/training_data_query.sql
index 74f39e9f0aa..6674e0f069b 100644
--- a/bigquery/bqml/resources/training_data_query.sql
+++ b/bigquery/bqml/resources/training_data_query.sql
@@ -768,9 +768,9 @@ SELECT
   opponent.opp_possessions_std_last_5 AS opponent_opp_possessions_std_last_5,
   opponent.opp_possessions_std_last_10 AS opponent_opp_possessions_std_last_10
 FROM
-  `bqml_tutorial.cume_games` AS team
+  `{0}.cume_games` AS team
 JOIN
-  `bqml_tutorial.cume_games` AS opponent
+  `{0}.cume_games` AS opponent
 ON team.game_id = opponent.game_id
 AND team.team_id != opponent.team_id
 WHERE

From 4186aa68428025000c3645ad6d63428fb8c942ac Mon Sep 17 00:00:00 2001
From: Seth Hollyman
Date: Fri, 14 Feb 2020 19:42:49 +0000
Subject: [PATCH 2/3] whitespace lint

---
 bigquery/bqml/data_scientist_tutorial_test.py   |  2 +-
 bigquery/bqml/ncaa_tutorial_test.py             | 16 +++++-----------
 bigquery/bqml/resources/feature_input_query.sql |  6 +++++-
 bigquery/bqml/resources/training_data_query.sql |  1 +
 4 files changed, 12 insertions(+), 13 deletions(-)

diff --git a/bigquery/bqml/data_scientist_tutorial_test.py b/bigquery/bqml/data_scientist_tutorial_test.py
index 8d1acb372c5..eb4ef4b821b 100644
--- a/bigquery/bqml/data_scientist_tutorial_test.py
+++ b/bigquery/bqml/data_scientist_tutorial_test.py
@@ -20,7 +20,7 @@
 # [START bqml_data_scientist_tutorial_import_and_client]
 client = bigquery.Client()
-# We use a unique dataset ID for this example to avoid collisions with 
+# We use a unique dataset ID for this example to avoid collisions with
 # other invocations of this tutorial. In practice, you could leverage
 # a persistent dataset and not create/destroy it with each invocation.
 dataset_id = "bqml_tutorial_{}".format(str(uuid.uuid4().hex))
diff --git a/bigquery/bqml/ncaa_tutorial_test.py b/bigquery/bqml/ncaa_tutorial_test.py
index d17fac3a9c7..488684d8666 100644
--- a/bigquery/bqml/ncaa_tutorial_test.py
+++ b/bigquery/bqml/ncaa_tutorial_test.py
@@ -23,7 +23,7 @@
 # [START bqml_ncaa_tutorial_import_and_client]
 client = bigquery.Client()
-# We use a unique dataset ID for this example to avoid collisions with 
+# We use a unique dataset ID for this example to avoid collisions with
 # other invocations of this tutorial. In practice, you could leverage
 # a persistent dataset and not create/destroy it with each invocation.
 dataset_id = "bqml_tutorial_{}".format(str(uuid.uuid4().hex))
@@ -47,19 +47,13 @@ def test_ncaa_tutorial(delete_dataset):
     # Create the tables used by the tutorial
     # Note: the queries are saved to a file. This should be updated to use the
     # saved queries once the library supports running saved queries.
-    query_filepath_to_table_name = {
-        'feature_input_query.sql': 'cume_games',
-        'training_data_query.sql': 'wide_games'
-    }
+    query_files = ['feature_input_query.sql', 'training_data_query.sql']
     resources_directory = os.path.join(os.path.dirname(__file__), 'resources')
-    for query_filepath, table_name in query_filepath_to_table_name.items():
-        table_ref = dataset.table(table_name)
-        job_config = bigquery.QueryJobConfig()
-        job_config.destination = table_ref
+    for fname in query_files:
         query_filepath = os.path.join(
-            resources_directory, query_filepath)
+            resources_directory, fname)
         sql = io.open(query_filepath, 'r', encoding='utf-8').read().format(dataset_id)
-        client.query(sql, job_config=job_config).result()
+        client.query(sql).result()
 
     # [START bqml_ncaa_tutorial_create_model]
     sql = """
diff --git a/bigquery/bqml/resources/feature_input_query.sql b/bigquery/bqml/resources/feature_input_query.sql
index d54f003425d..6348c7ea462 100644
--- a/bigquery/bqml/resources/feature_input_query.sql
+++ b/bigquery/bqml/resources/feature_input_query.sql
@@ -1,4 +1,8 @@
-#standardSQL
+# This query creates a sample table using
+# the ncaa_basketball public dataset. It
+# uses a format string token for setting
+# the destination dataset.
+CREATE OR REPLACE TABLE `{0}.cume_games` AS
 SELECT
   game_id,
   season,
diff --git a/bigquery/bqml/resources/training_data_query.sql b/bigquery/bqml/resources/training_data_query.sql
index 6674e0f069b..5c2f6708d4e 100644
--- a/bigquery/bqml/resources/training_data_query.sql
+++ b/bigquery/bqml/resources/training_data_query.sql
@@ -1,4 +1,5 @@
 #standardSQL
+CREATE OR REPLACE TABLE `{0}.wide_games` AS
 SELECT
   team.game_id AS game_id,
   team.season AS season,

From efb5b0536e0080ab906aaf53be959f3e2819af36 Mon Sep 17 00:00:00 2001
From: Seth Hollyman
Date: Sat, 15 Feb 2020 00:23:56 +0000
Subject: [PATCH 3/3] update dependencies in requirements.txt

---
 bigquery/bqml/requirements.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bigquery/bqml/requirements.txt b/bigquery/bqml/requirements.txt
index 0fbfc4124cc..dee3fe25961 100644
--- a/bigquery/bqml/requirements.txt
+++ b/bigquery/bqml/requirements.txt
@@ -1,4 +1,4 @@
-pandas==0.22
-google-cloud-bigquery>=1.24.0
+pandas==0.24.2
+google-cloud-bigquery==1.23.1
 flaky==3.6.1
 mock==3.0.5
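
Note (separate from the patch series above): the following is a minimal, standalone sketch of the pattern these changes adopt, a UUID-suffixed dataset ID threaded into the SQL with str.format(). It assumes the google-cloud-bigquery client version pinned in requirements.txt and an environment with default credentials; the tiny table it creates is purely illustrative and not part of the patches.

import uuid

from google.cloud import bigquery

client = bigquery.Client()
# Unique per invocation, so concurrent runs cannot stomp on one another.
dataset_id = "bqml_tutorial_{}".format(uuid.uuid4().hex)

# Create the throwaway dataset under the generated name.
dataset = bigquery.Dataset(client.dataset(dataset_id))
dataset.location = 'US'
client.create_dataset(dataset)

try:
    # Parameterize the dataset name into the query text, as the tutorial SQL now does.
    sql = """
        CREATE OR REPLACE TABLE `{0}.example_table` AS
        SELECT 1 AS example_column
    """.format(dataset_id)
    client.query(sql).result()
finally:
    # Remove the uniquely named dataset and everything created inside it.
    client.delete_dataset(
        client.dataset(dataset_id), delete_contents=True)

Because every invocation gets its own dataset name, test runs no longer race on creating and deleting a shared bqml_tutorial dataset.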