# Using the Spark BigQuery connector in Jupyter
Use a Python kernel (not PySpark) to allow configuring the SparkSession inside the notebook.

In [1]:
from pyspark.sql import SparkSession
# Build the SparkSession for this notebook with the BigQuery connector jar
# listed in 'spark.jars'.
# getOrCreate() hands back an already-running session when one exists.
# NOTE(review): in that case the jar setting presumably is not applied, so run
# this cell on a fresh kernel.
# NOTE(review): confirm that the "latest" connector jar matches the cluster's
# Scala/Spark version.
spark = (
    SparkSession.builder
    .appName('Top Shakespeare words')  # spelling of "Shakespeare" corrected
    .config('spark.jars', 'gs://spark-lib/bigquery/spark-bigquery-latest.jar')
    .getOrCreate()
)

Read the data in from BigQuery

In [2]:
# Point a DataFrameReader at the 'bigquery' data source and the public
# Shakespeare sample table, then load it into `df`.
bigquery_reader = spark.read.format('bigquery')
bigquery_reader = bigquery_reader.option(
    'table', 'bigquery-public-data.samples.shakespeare'
)
df = bigquery_reader.load()

Convert words to lower case and filter out stop words

In [3]:
from pyspark.ml.feature import StopWordsRemover
from pyspark.sql import functions as F

# Lower-case each word and wrap it in a one-element array column ('lowered'),
# which is the input column handed to StopWordsRemover below.
# A 'filtered' column left over from a previous run of this cell is dropped
# first (drop() ignores column names that are not present), so re-executing the
# cell does not fail because the remover's output column already exists.
df = df.drop('filtered').withColumn('lowered', F.array(F.lower(F.col('word'))))

# Writes the stop-word-free array to 'filtered'.
remover = StopWordsRemover(inputCol='lowered', outputCol='filtered')

df = remover.transform(df)

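Create a `(count, word)` struct and take the max of that struct in each corpus. Structs are compared field by field, so the max is the entry with the highest count, i.e. the most frequent non-stop word.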

In [4]:
# Struct pairing each row's word_count with the first element of its 'filtered'
# array, exposed under the field name 'word'.
count_word = F.struct(
    df.word_count,
    df.filtered.getItem(0).alias('word'),
).alias('count_word')

# 'max(count_word)' is the column name referenced after the dict-style agg below.
max_count_word = F.col('max(count_word)')

top_words = (
    df.select(df.corpus, count_word)
    # Keep only rows that still have a word after stop-word removal.
    .where(F.col('count_word').getItem('word').isNotNull())
    .groupby('corpus')
    .agg({'count_word': 'max'})
    .orderBy('corpus')
    .select(
        'corpus',
        max_count_word.getItem('word').alias('word'),
        max_count_word.getItem('word_count').alias('count'),
    )
)

# Last expression of the cell: collect to pandas so the notebook renders the table.
top_words.toPandas()

Unnamed: 0,corpus,word,count
0,1kinghenryiv,henry,252
1,1kinghenryvi,thy,157
2,2kinghenryiv,falstaff,199
3,2kinghenryvi,thou,187
4,3kinghenryvi,king,249
5,allswellthatendswell,parolles,165
6,antonyandcleopatra,antony,284
7,asyoulikeit,rosalind,217
8,comedyoferrors,syracuse,204
9,coriolanus,coriolanus,207
