In [1]:
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 1.3. BigQuery Storage & Spark ML - Python

### Python 3 Kernel

Use a Python 3 kernel (not PySpark) to allow you to configure the SparkSession in the notebook and include the [spark-bigquery-connector](https://github.com/GoogleCloudDataproc/spark-bigquery-connector) required to use the [BigQuery Storage API](https://cloud.google.com/bigquery/docs/reference/storage).

### Scala Version

Check which version of Scala you are running so that you can include the matching spark-bigquery-connector jar.

In [3]:
# Print the installed Scala version; it determines which connector jar to use below.
!scala -version

Scala code runner version 2.12.10 -- Copyright 2002-2019, LAMP/EPFL and Lightbend, Inc.


### Create Spark Session

Include the version of the spark-bigquery-connector jar that matches your Scala version:

If you are using Scala version 2.11, use `'gs://spark-lib/bigquery/spark-bigquery-latest.jar'`.

If you are using Scala version 2.12, use `'gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar'`.

In [12]:
from pyspark.sql import SparkSession

# Scala 2.12 build of the spark-bigquery-connector; it must match the Scala
# version reported by `scala -version`.
connector_jar = 'gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar'

# Build (or reuse) the session with the connector jar on the classpath.
spark = (
    SparkSession.builder
    .appName('BigQuery Storage &  Spark ML')
    .config('spark.jars', connector_jar)
    .getOrCreate()
)

In [3]:
# Fully-qualified BigQuery table to load through the spark-bigquery-connector.
table = 'bigquery-public-data.samples.natality'

# Configure the reader first, then load the table as a Spark DataFrame.
natality_reader = spark.read.format("bigquery").option("table", table)
df_natality_table = natality_reader.load()

In [4]:
# Cap on the number of rows kept for this demo.
limit = 1000

# The label (weight_pounds) plus the five columns later used as features.
selected_columns = [
    "weight_pounds",
    "mother_age",
    "father_age",
    "gestation_weeks",
    "weight_gain_pounds",
    "apgar_5min",
]

# Keep only rows in which every selected column is populated.
not_null_filter = """
weight_pounds IS NOT NULL 
AND mother_age IS NOT NULL
AND father_age IS NOT NULL
AND gestation_weeks IS NOT NULL
AND weight_gain_pounds IS NOT NULL
AND apgar_5min IS NOT NULL
"""

# Project, filter, truncate to `limit` rows and cache the result for reuse.
df_natality_select = (
    df_natality_table
    .select(*selected_columns)
    .where(not_null_filter)
    .limit(limit)
    .cache()
)

df_natality_select.printSchema()

root
 |-- weight_pounds: double (nullable = true)
 |-- mother_age: long (nullable = true)
 |-- father_age: long (nullable = true)
 |-- gestation_weeks: long (nullable = true)
 |-- weight_gain_pounds: long (nullable = true)
 |-- apgar_5min: long (nullable = true)



In [5]:
# Number of rows in the cached sample (capped by `limit` in the previous cell).
df_natality_select.count()

1000

In [6]:
# Register the sample as the temp view "natality" so that Spark SQL queries
# can be run against the data.
df_natality_select.createOrReplaceTempView("natality")

In [7]:
# Optional
# As a precaution, run a query in Spark SQL to ensure no NULL values exist in
# any column used downstream: the label (weight_pounds) and all five feature
# columns that are passed to VectorAssembler. weight_gain_pounds and
# apgar_5min are checked too, because VectorAssembler's default
# handleInvalid="error" fails on NULL inputs.
spark_sql_query = """
SELECT *
from natality
where weight_pounds is not null
and mother_age is not null
and father_age is not null
and gestation_weeks is not null
and weight_gain_pounds is not null
and apgar_5min is not null
"""

# Re-bind df_natality_select to the verified rows; later cells read this name.
df_natality_select = spark.sql(spark_sql_query).cache()
df_natality_select.printSchema()

root
 |-- weight_pounds: double (nullable = true)
 |-- mother_age: long (nullable = true)
 |-- father_age: long (nullable = true)
 |-- gestation_weeks: long (nullable = true)
 |-- weight_gain_pounds: long (nullable = true)
 |-- apgar_5min: long (nullable = true)



In [8]:
# Create an input DataFrame for Spark ML using VectorAssembler.
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Every selected column except the label (weight_pounds) becomes a feature.
feature_columns = [
    "mother_age",
    "father_age",
    "gestation_weeks",
    "weight_gain_pounds",
    "apgar_5min",
]

# Pack the feature columns into a single vector column named "features".
assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

output = assembler.transform(df_natality_select)
output.show()

+------------------+----------+----------+---------------+------------------+----------+--------------------+
|     weight_pounds|mother_age|father_age|gestation_weeks|weight_gain_pounds|apgar_5min|            features|
+------------------+----------+----------+---------------+------------------+----------+--------------------+
|     8.62889293468|        34|        38|             41|                57|         9|[34.0,38.0,41.0,5...|
|      2.6786164833|        36|        39|             34|                23|         6|[36.0,39.0,34.0,2...|
|    11.06279630716|        38|        41|             41|                11|         9|[38.0,41.0,41.0,1...|
|     5.43659938092|        42|        42|             38|                10|         9|[42.0,42.0,38.0,1...|
|3.5604655312999998|        38|        43|             31|                18|         8|[38.0,43.0,31.0,1...|
|     5.99877814902|        37|        42|             39|                20|         9|[37.0,42.0,39.0,2...|
|     9.18

In [9]:
# Keep only the assembled feature vector and the target, renaming the target
# column to "label", then cache the result.
features_and_weight = output.select("features", "weight_pounds")
training_data = features_and_weight.withColumnRenamed("weight_pounds", "label").cache()
training_data.show()

+--------------------+------------------+
|            features|             label|
+--------------------+------------------+
|[34.0,38.0,41.0,5...|     8.62889293468|
|[36.0,39.0,34.0,2...|      2.6786164833|
|[38.0,41.0,41.0,1...|    11.06279630716|
|[42.0,42.0,38.0,1...|     5.43659938092|
|[38.0,43.0,31.0,1...|3.5604655312999998|
|[37.0,42.0,39.0,2...|     5.99877814902|
|[28.0,36.0,38.0,3...|     9.18666245754|
|[26.0,27.0,36.0,1...|     4.87442061282|
|[36.0,39.0,40.0,1...| 8.000575487979999|
|[33.0,34.0,39.0,9...|      6.5256829552|
|[39.0,37.0,37.0,9...|      6.5697754076|
|[25.0,31.0,41.0,9...|      7.6610636045|
|[29.0,35.0,33.0,9...|     5.43659938092|
|[40.0,61.0,40.0,9...|     8.50102482272|
|[39.0,40.0,39.0,6...| 8.375361333379999|
|[29.0,34.0,34.0,4...|     4.87442061282|
|[36.0,39.0,33.0,3...|     5.00008410216|
|[41.0,99.0,40.0,1...| 8.811876612139999|
|[35.0,39.0,39.0,1...|     8.24969784404|
|[33.0,40.0,38.0,1...|      6.8122838958|
+--------------------+------------

In [10]:
# LinearRegression lives in pyspark.ml.regression; import it here so this cell
# does not raise NameError when the notebook is run on a fresh kernel.
from pyspark.ml.regression import LinearRegression

# Construct a new LinearRegression object and fit the training data.
#   maxIter=5        - upper bound on optimizer iterations
#   regParam=0.2     - regularization strength
#   solver="normal"  - solve with the normal-equation method
# The estimator is left on its default input column names, which match the
# "features" and "label" columns of training_data.
lr = LinearRegression(maxIter=5, regParam=0.2, solver="normal")
model = lr.fit(training_data)

Coefficients:[0.032105804524621015,-0.005448902107015991,0.20940221495235767,0.001514042532387989,0.0018310820530353473]
Intercept:-1.565575404366004
R^2:0.24565216749816365
+-------------------+
|          residuals|
+-------------------+
| 0.6216582894053992|
| -3.864594671990025|
| 3.0131311065978093|
|-2.1063194482441983|
| -2.393046931198261|
|-1.6081542977973298|
|  2.021483412756962|
|-1.8283327433775005|
| 0.2105982221406011|
| -1.280824235206806|
|-0.9942154737287696|
|-0.3237482859356007|
|-0.9796223995671598|
|0.40749514257792185|
| 0.6191313892123835|
|   -1.5100966032044|
|-1.3510133850752313|
| 1.1825653194895507|
| 0.5318890853072356|
|-0.6173778816298201|
+-------------------+
only showing top 20 rows



In [11]:
# Report the fitted parameters, the R^2 on the training data, and the
# per-row residuals from the model's training summary.
training_summary = model.summary

print("Coefficients:", model.coefficients, sep="")
print("Intercept:", model.intercept, sep="")
print("R^2:", training_summary.r2, sep="")
training_summary.residuals.show()

Coefficients:[0.032105804524621015,-0.005448902107015991,0.20940221495235767,0.001514042532387989,0.0018310820530353473]
Intercept:-1.565575404366004
R^2:0.24565216749816365
+-------------------+
|          residuals|
+-------------------+
| 0.6216582894053992|
| -3.864594671990025|
| 3.0131311065978093|
|-2.1063194482441983|
| -2.393046931198261|
|-1.6081542977973298|
|  2.021483412756962|
|-1.8283327433775005|
| 0.2105982221406011|
| -1.280824235206806|
|-0.9942154737287696|
|-0.3237482859356007|
|-0.9796223995671598|
|0.40749514257792185|
| 0.6191313892123835|
|   -1.5100966032044|
|-1.3510133850752313|
| 1.1825653194895507|
| 0.5318890853072356|
|-0.6173778816298201|
+-------------------+
only showing top 20 rows

