
Commit 7ed341e

docs(samples): Add Dataflow BigQueryIO write snippets (#8414)

* BigQuery write samples
* Add license headers
* Fix lint errors

Co-authored-by: Veronica Wasson <VeronicaWasson@users.noreply.github.com>

Parent: f55912e

File tree: 8 files changed (+426 −14 lines)


dataflow/snippets/src/main/java/com/example/dataflow/BigQueryReadAvro.java

Lines changed: 3 additions & 3 deletions

@@ -55,10 +55,10 @@ public static void main(String[] args) {
     // Parse the pipeline options passed into the application. Example:
     // --projectId=$PROJECT_ID --datasetName=$DATASET_NAME --tableName=$TABLE_NAME
     // For more information, see https://beam.apache.org/documentation/programming-guide/#configuring-pipeline-options
-    PipelineOptionsFactory.register(BigQueryReadOptions.class);
-    BigQueryReadOptions options = PipelineOptionsFactory.fromArgs(args)
+    PipelineOptionsFactory.register(ExamplePipelineOptions.class);
+    ExamplePipelineOptions options = PipelineOptionsFactory.fromArgs(args)
         .withValidation()
-        .as(BigQueryReadOptions.class);
+        .as(ExamplePipelineOptions.class);
 
     // Create a pipeline and apply transforms.
     Pipeline pipeline = Pipeline.create(options);
dataflow/snippets/src/main/java/com/example/dataflow/BigQueryReadWithProjectionAndFiltering.java

Lines changed: 3 additions & 3 deletions

@@ -31,10 +31,10 @@ public static void main(String[] args) {
     // Parse the pipeline options passed into the application. Example:
     // --projectId=$PROJECT_ID --datasetName=$DATASET_NAME --tableName=$TABLE_NAME
     // For more information, see https://beam.apache.org/documentation/programming-guide/#configuring-pipeline-options
-    PipelineOptionsFactory.register(BigQueryReadOptions.class);
-    BigQueryReadOptions options = PipelineOptionsFactory.fromArgs(args)
+    PipelineOptionsFactory.register(ExamplePipelineOptions.class);
+    ExamplePipelineOptions options = PipelineOptionsFactory.fromArgs(args)
         .withValidation()
-        .as(BigQueryReadOptions.class);
+        .as(ExamplePipelineOptions.class);
 
     // Create a pipeline and apply transforms.
     Pipeline pipeline = Pipeline.create(options);
dataflow/snippets/src/main/java/com/example/dataflow/BigQueryStreamExactlyOnce.java (new file)

Lines changed: 90 additions & 0 deletions

@@ -0,0 +1,90 @@
/*
 * Copyright 2023 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.example.dataflow;

// [START dataflow_bigquery_stream_exactly_once]
import com.google.api.services.bigquery.model.TableRow;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.testing.TestStream;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.values.TimestampedValue;
import org.apache.beam.sdk.values.TypeDescriptor;
import org.joda.time.Duration;
import org.joda.time.Instant;

public class BigQueryStreamExactlyOnce {
  // Create a PTransform that sends simulated streaming data. In a real application, the data
  // source would be an external source, such as Pub/Sub.
  private static TestStream<String> createEventSource() {
    Instant startTime = new Instant(0);
    return TestStream.create(StringUtf8Coder.of())
        .advanceWatermarkTo(startTime)
        .addElements(
            TimestampedValue.of("Alice,20", startTime),
            TimestampedValue.of("Bob,30",
                startTime.plus(Duration.standardSeconds(1))),
            TimestampedValue.of("Charles,40",
                startTime.plus(Duration.standardSeconds(2))))
        .advanceWatermarkToInfinity();
  }

  public static PipelineResult main(String[] args) {
    // Parse the pipeline options passed into the application. Example:
    // --projectId=$PROJECT_ID --datasetName=$DATASET_NAME --tableName=$TABLE_NAME
    // For more information, see https://beam.apache.org/documentation/programming-guide/#configuring-pipeline-options
    PipelineOptionsFactory.register(ExamplePipelineOptions.class);
    ExamplePipelineOptions options = PipelineOptionsFactory.fromArgs(args)
        .withValidation()
        .as(ExamplePipelineOptions.class);
    options.setStreaming(true);

    // Create a pipeline and apply transforms.
    Pipeline pipeline = Pipeline.create(options);
    pipeline
        // Add a streaming data source.
        .apply(createEventSource())
        // Map the event data into TableRow objects.
        .apply(MapElements
            .into(TypeDescriptor.of(TableRow.class))
            .via((String x) -> {
              String[] columns = x.split(",");
              return new TableRow().set("user_name", columns[0]).set("age", columns[1]);
            }))
        // Write the rows to BigQuery.
        .apply(BigQueryIO.writeTableRows()
            .to(String.format("%s:%s.%s",
                options.getProjectId(),
                options.getDatasetName(),
                options.getTableName()))
            .withCreateDisposition(CreateDisposition.CREATE_NEVER)
            .withWriteDisposition(WriteDisposition.WRITE_APPEND)
            .withMethod(Write.Method.STORAGE_WRITE_API)
            // For exactly-once processing, set the number of Write API streams and the
            // triggering frequency.
            .withNumStorageWriteApiStreams(1)
            .withTriggeringFrequency(Duration.standardSeconds(5)));
    return pipeline.run();
  }
}
// [END dataflow_bigquery_stream_exactly_once]
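
A note on the trade-off in this snippet: STORAGE_WRITE_API with a fixed stream count and triggering frequency gives exactly-once semantics. BigQueryIO also offers an at-least-once variant that skips that tuning in exchange for possible duplicate rows. A hedged sketch of the alternative sink, reusing the imports and options type from the snippet above (not part of this commit):

  // Hypothetical variant: at-least-once delivery. STORAGE_API_AT_LEAST_ONCE
  // may write duplicate rows on retries, but requires no stream count or
  // triggering frequency.
  static BigQueryIO.Write<TableRow> atLeastOnceWrite(ExamplePipelineOptions options) {
    return BigQueryIO.writeTableRows()
        .to(String.format("%s:%s.%s",
            options.getProjectId(),
            options.getDatasetName(),
            options.getTableName()))
        .withCreateDisposition(CreateDisposition.CREATE_NEVER)
        .withWriteDisposition(WriteDisposition.WRITE_APPEND)
        .withMethod(Write.Method.STORAGE_API_AT_LEAST_ONCE);
  }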
dataflow/snippets/src/main/java/com/example/dataflow/BigQueryWrite.java (new file)

Lines changed: 83 additions & 0 deletions

@@ -0,0 +1,83 @@
/*
 * Copyright 2023 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.example.dataflow;

// [START dataflow_bigquery_write]
import com.google.api.services.bigquery.model.TableRow;
import java.util.Arrays;
import java.util.List;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.AvroCoder;
import org.apache.beam.sdk.coders.DefaultCoder;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.WriteDisposition;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;

public class BigQueryWrite {
  // A custom datatype for the source data.
  @DefaultCoder(AvroCoder.class)
  public static class MyData {
    public String name;
    public Long age;

    public MyData() {}

    public MyData(String name, Long age) {
      this.name = name;
      this.age = age;
    }
  }

  public static void main(String[] args) {
    // Example source data.
    final List<MyData> data = Arrays.asList(
        new MyData("Alice", 40L),
        new MyData("Bob", 30L),
        new MyData("Charlie", 20L)
    );

    // Parse the pipeline options passed into the application. Example:
    // --projectId=$PROJECT_ID --datasetName=$DATASET_NAME --tableName=$TABLE_NAME
    // For more information, see https://beam.apache.org/documentation/programming-guide/#configuring-pipeline-options
    PipelineOptionsFactory.register(ExamplePipelineOptions.class);
    ExamplePipelineOptions options = PipelineOptionsFactory.fromArgs(args)
        .withValidation()
        .as(ExamplePipelineOptions.class);

    // Create a pipeline and apply transforms.
    Pipeline pipeline = Pipeline.create(options);
    pipeline
        // Create an in-memory PCollection of MyData objects.
        .apply(Create.of(data))
        // Write the data to an existing BigQuery table.
        .apply(BigQueryIO.<MyData>write()
            .to(String.format("%s:%s.%s",
                options.getProjectId(),
                options.getDatasetName(),
                options.getTableName()))
            .withFormatFunction(
                (MyData x) -> new TableRow().set("user_name", x.name).set("age", x.age))
            .withCreateDisposition(CreateDisposition.CREATE_NEVER)
            .withWriteDisposition(WriteDisposition.WRITE_APPEND)
            .withMethod(Write.Method.STORAGE_WRITE_API));
    pipeline.run().waitUntilFinish();
  }
}
// [END dataflow_bigquery_write]
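
As an alternative to the hand-written format function above, Beam can infer the TableRow conversion when the element type carries a Beam schema. A sketch under that assumption (hypothetical class, not part of this commit; the field names are assumed to match the table's columns):

  // Hypothetical schema-aware element type; @DefaultSchema lets BigQueryIO
  // derive the row conversion via useBeamSchema() instead of
  // withFormatFunction(...).
  import org.apache.beam.sdk.schemas.JavaFieldSchema;
  import org.apache.beam.sdk.schemas.annotations.DefaultSchema;

  @DefaultSchema(JavaFieldSchema.class)
  public class MyDataWithSchema {
    public String user_name; // mirrors the BigQuery column names
    public Long age;

    public MyDataWithSchema() {}
  }

  // The write step then needs no format function:
  //   BigQueryIO.<MyDataWithSchema>write()
  //       .to(tableSpec)
  //       .useBeamSchema()
  //       .withMethod(Write.Method.STORAGE_WRITE_API)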
dataflow/snippets/src/main/java/com/example/dataflow/BigQueryWriteWithSchema.java (new file)

Lines changed: 99 additions & 0 deletions

@@ -0,0 +1,99 @@
/*
 * Copyright 2023 Google LLC
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.example.dataflow;

// [START dataflow_bigquery_write_with_schema]
import com.google.api.services.bigquery.model.TableFieldSchema;
import com.google.api.services.bigquery.model.TableRow;
import com.google.api.services.bigquery.model.TableSchema;
import java.util.Arrays;
import java.util.List;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.AvroCoder;
import org.apache.beam.sdk.coders.DefaultCoder;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.CreateDisposition;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;

public class BigQueryWriteWithSchema {
  // A custom datatype for the source data.
  @DefaultCoder(AvroCoder.class)
  public static class MyData {
    public String name;
    public Long age;

    public MyData() {}

    public MyData(String name, Long age) {
      this.name = name;
      this.age = age;
    }
  }

  public static void main(String[] args) {
    // Example source data.
    final List<MyData> data = Arrays.asList(
        new MyData("Alice", 40L),
        new MyData("Bob", 30L),
        new MyData("Charlie", 20L)
    );

    // Define a table schema. A schema is required for create disposition CREATE_IF_NEEDED.
    TableSchema schema = new TableSchema()
        .setFields(
            Arrays.asList(
                new TableFieldSchema()
                    .setName("user_name")
                    .setType("STRING")
                    .setMode("REQUIRED"),
                new TableFieldSchema()
                    .setName("age")
                    .setType("INT64") // Defaults to NULLABLE
            )
        );

    // Parse the pipeline options passed into the application. Example:
    // --projectId=$PROJECT_ID --datasetName=$DATASET_NAME --tableName=$TABLE_NAME
    // For more information, see https://beam.apache.org/documentation/programming-guide/#configuring-pipeline-options
    PipelineOptionsFactory.register(ExamplePipelineOptions.class);
    ExamplePipelineOptions options = PipelineOptionsFactory.fromArgs(args)
        .withValidation()
        .as(ExamplePipelineOptions.class);

    // Create a pipeline and apply transforms.
    Pipeline pipeline = Pipeline.create(options);
    pipeline
        // Create an in-memory PCollection of MyData objects.
        .apply(Create.of(data))
        // Write the data to a new or existing BigQuery table.
        .apply(BigQueryIO.<MyData>write()
            .to(String.format("%s:%s.%s",
                options.getProjectId(),
                options.getDatasetName(),
                options.getTableName()))
            .withFormatFunction(
                (MyData x) -> new TableRow().set("user_name", x.name).set("age", x.age))
            .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
            .withSchema(schema)
            .withMethod(Write.Method.STORAGE_WRITE_API)
        );
    pipeline.run().waitUntilFinish();
  }
}
// [END dataflow_bigquery_write_with_schema]
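
Related: withSchema takes the TableSchema object built above, but BigQueryIO.Write also accepts the same schema as a JSON string via withJsonSchema, which can be handy when the schema is stored alongside the pipeline. A small sketch (assumed equivalent to the schema in the snippet; not part of this commit):

  // Hypothetical alternative: supply the schema as JSON rather than
  // building TableFieldSchema objects by hand.
  String jsonSchema = "{\"fields\": ["
      + "{\"name\": \"user_name\", \"type\": \"STRING\", \"mode\": \"REQUIRED\"},"
      + "{\"name\": \"age\", \"type\": \"INT64\"}"
      + "]}";
  // ...then replace .withSchema(schema) with .withJsonSchema(jsonSchema).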

dataflow/snippets/src/main/java/com/example/dataflow/BiqQueryReadTableRows.java

Lines changed: 3 additions & 3 deletions

@@ -30,10 +30,10 @@ public static void main(String[] args) {
     // Parse the pipeline options passed into the application. Example:
     // --projectId=$PROJECT_ID --datasetName=$DATASET_NAME --tableName=$TABLE_NAME
     // For more information, see https://beam.apache.org/documentation/programming-guide/#configuring-pipeline-options
-    PipelineOptionsFactory.register(BigQueryReadOptions.class);
-    BigQueryReadOptions options = PipelineOptionsFactory.fromArgs(args)
+    PipelineOptionsFactory.register(ExamplePipelineOptions.class);
+    ExamplePipelineOptions options = PipelineOptionsFactory.fromArgs(args)
         .withValidation()
-        .as(BigQueryReadOptions.class);
+        .as(ExamplePipelineOptions.class);
 
     // Create a pipeline and apply transforms.
     Pipeline pipeline = Pipeline.create(options);

dataflow/snippets/src/main/java/com/example/dataflow/BigQueryReadOptions.java renamed to dataflow/snippets/src/main/java/com/example/dataflow/ExamplePipelineOptions.java

Lines changed: 5 additions & 5 deletions

@@ -17,24 +17,24 @@
 package com.example.dataflow;
 
 import org.apache.beam.sdk.options.Description;
-import org.apache.beam.sdk.options.PipelineOptions;
+import org.apache.beam.sdk.options.StreamingOptions;
 
 /**
  * Extends PipelineOptions and adds custom pipeline options for this sample.
  */
-public interface BigQueryReadOptions extends PipelineOptions {
+public interface ExamplePipelineOptions extends StreamingOptions {
   @Description("Project ID for the BigQuery table")
   String getProjectId();
 
-  void setProjectId(String input);
+  void setProjectId(String value);
 
   @Description("Dataset for the BigQuery table")
   String getDatasetName();
 
-  void setDatasetName(String output);
+  void setDatasetName(String value);
 
   @Description("BigQuery table name")
   String getTableName();
 
-  void setTableName(String output);
+  void setTableName(String value);
 }
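
For reference, the options interface after this rename reads in full (reconstructed from the diff above):

package com.example.dataflow;

import org.apache.beam.sdk.options.Description;
import org.apache.beam.sdk.options.StreamingOptions;

/**
 * Extends PipelineOptions and adds custom pipeline options for this sample.
 */
public interface ExamplePipelineOptions extends StreamingOptions {
  @Description("Project ID for the BigQuery table")
  String getProjectId();

  void setProjectId(String value);

  @Description("Dataset for the BigQuery table")
  String getDatasetName();

  void setDatasetName(String value);

  @Description("BigQuery table name")
  String getTableName();

  void setTableName(String value);
}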
