In [3]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

sparkConf = SparkConf()
sparkConf.setMaster("spark://spark-master:7077")
sparkConf.setAppName("BigqueryExample")
sparkConf.set("spark.driver.memory", "2g")
sparkConf.set("spark.executor.cores", "1")
sparkConf.set("spark.driver.cores", "1")
# create the spark session, which is the entry point to Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
# Load data from BigQuery.
df = spark.read \
  .format("bigquery") \
  .load("de2022-362409.labdataset.retaildata")    # project_id.datatset.tablename. Use your project id
df.printSchema()
df.show(4)

root
 |-- InvoiceNo: long (nullable = true)
 |-- StockCode: long (nullable = true)
 |-- Description: string (nullable = true)
 |-- Quantity: long (nullable = true)
 |-- InvoiceDate: string (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- CustomerID: long (nullable = true)
 |-- Country: string (nullable = true)

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536369|    21756|BATH BUILDING BLO...|       3|12/1/2010 8:35|     5.95|     13047|United Kingdom|
|   536365|    85123|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536370|    21035|SET/2 RED RETROSP...|      18|12/

In [5]:
from pyspark.sql.functions import explode, split, concat, col, lit, desc

words = df.where(df.Country == "France").select(
            explode(
                split(col("Description"), " ")
            ).alias("word")
        )
words.show(10)

ordered_word_count = words.groupby(words.word).count().orderBy(col("count").desc())
ordered_word_count.show(100)

+---------+
|     word|
+---------+
|    SET/2|
|      RED|
|RETROSPOT|
|      TEA|
|   TOWELS|
|    ROUND|
|    SNACK|
|    BOXES|
|      SET|
|      OF4|
+---------+
only showing top 10 rows

+---------+-----+
|     word|count|
+---------+-----+
|      BOX|    1|
|      RED|    1|
|      TEA|    1|
|      OF4|    1|
|    SNACK|    1|
| WOODLAND|    1|
|    TAILS|    1|
|     CARD|    1|
|    ROUND|    1|
|    SET/2|    1|
|    LUNCH|    1|
|      SET|    1|
|   CIRCUS|    1|
|      AND|    1|
|     GAME|    1|
|   PARADE|    1|
|    BOXES|    1|
|  VINTAGE|    1|
|    HEADS|    1|
|RETROSPOT|    1|
|   TOWELS|    1|
+---------+-----+



In [6]:
# Use the Cloud Storage bucket for temporary BigQuery export data used by the connector.
bucket = "de_jads_temp"  # use your bucket 
spark.conf.set('temporaryGcsBucket', bucket)
# Setup hadoop fs configuration for schema gs://
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
# Saving the data to BigQuery
ordered_word_count.write.format('bigquery') \
  .option('table', 'de2022-362409.labdataset.wordcounts') \
  .mode("append") \
  .save()

In [7]:
# Stop the spark context
spark.stop()