In [1]:
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 1.3. BigQuery Storage & Spark MLlib - Python

Use the BigQuery storage connector and [Spark MLlib](https://spark.apache.org/docs/latest/ml-guide.html) to build a Linear Regression model and make predictions

### Create Dataproc Cluster with Jupyter

This notebook is designed to be run on Google Cloud Dataproc.

Follow the links below for instructions on how to create a Dataproc Cluster with the Jupyter component installed.

* [Tutorial - Install and run a Jupyter notebook on a Dataproc cluster](https://cloud.google.com/dataproc/docs/tutorials/jupyter-notebook)
* [Blog post - Apache Spark and Jupyter Notebooks made easy with Dataproc component gateway](https://medium.com/google-cloud/apache-spark-and-jupyter-notebooks-made-easy-with-dataproc-component-gateway-fa91d48d6a5a)

### Python 3 Kernel

Use a Python 3 kernel (not PySpark) to allow you to configure the SparkSession in the notebook and include the [spark-bigquery-connector](https://github.com/GoogleCloudDataproc/spark-bigquery-connector) required to use the [BigQuery Storage API](https://cloud.google.com/bigquery/docs/reference/storage).

### Scala Version

Check what version of Scala you are running so you can include the correct spark-bigquery-connector jar 

In [2]:
# Shell out to print the Scala code runner version; it decides which connector jar to use.
!scala -version

cat: /release: No such file or directory
Scala code runner version 2.11.12 -- Copyright 2002-2017, LAMP/EPFL


### Create Spark Session

Include the correct version of the spark-bigquery-connector jar

If you are using scala version 2.11 use `'gs://spark-lib/bigquery/spark-bigquery-latest.jar'`.

If you are using scala version 2.12 use `'gs://spark-lib/bigquery/spark-bigquery-latest_2.12.jar'`.

In [3]:
from pyspark.sql import SparkSession

# Build (or reuse) the SparkSession with the spark-bigquery-connector jar
# registered through the 'spark.jars' setting.
spark = (
    SparkSession.builder
    .appName('1.3. BigQuery Storage &  Spark MLlib - Python')
    .config('spark.jars', 'gs://spark-lib/bigquery/spark-bigquery-latest.jar')
    .getOrCreate()
)

# Last expression of the cell: displays the Spark version in use.
spark.version

'2.4.5'

### Enable repl.eagerEval

This will output the results of DataFrames in each step without the need to call `df.show()` and also improves the formatting of the output

In [4]:
# Turn on eager evaluation so a DataFrame left as the last expression of a
# cell is rendered without an explicit show() call.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

### Read the data from BigQuery as a Spark Dataframe

In [5]:
table = 'bigquery-public-data.samples.natality'

# Configure a reader for the "bigquery" data source pointed at `table`,
# then load it as a Spark DataFrame.
bigquery_reader = spark.read.format("bigquery").option("table", table)
df_natality_table = bigquery_reader.load()

### Limit no of rows and cache data

Limit the number of rows that will be read so that this example runs faster.

The DataFrame is cached as LinearRegression is iterative and this avoids re-reading the data from BigQuery Storage for each iteration

In [6]:
limit = 10000

# Keep only rows in which the label and every feature column are populated.
# NOTE(review): the sample outputs further down show the value 99 in
# father_age, weight_gain_pounds and apgar_5min -- presumably an "unknown"
# sentinel that this NULL filter does not remove; confirm against the
# dataset documentation.
not_null_filter = """
weight_pounds IS NOT NULL 
AND mother_age IS NOT NULL
AND father_age IS NOT NULL
AND gestation_weeks IS NOT NULL
AND weight_gain_pounds IS NOT NULL
AND apgar_5min IS NOT NULL
"""

# Project the label + feature columns, filter, cap the row count at `limit`,
# and mark the result for caching so later steps can reuse it.
df_natality_select = (
    df_natality_table
    .select(
        "weight_pounds",
        "mother_age",
        "father_age",
        "gestation_weeks",
        "weight_gain_pounds",
        "apgar_5min",
    )
    .where(not_null_filter)
    .limit(limit)
    .cache()
)

df_natality_select.printSchema()

root
 |-- weight_pounds: double (nullable = true)
 |-- mother_age: long (nullable = true)
 |-- father_age: long (nullable = true)
 |-- gestation_weeks: long (nullable = true)
 |-- weight_gain_pounds: long (nullable = true)
 |-- apgar_5min: long (nullable = true)



### Optional 
#### Run count to check no of rows in DataFrame

In [7]:
# Row count after the NOT NULL filter and the `limit` cap have been applied.
df_natality_select.count()

10000

### Create an input DataFrame for Spark MLlib using VectorAssembler

Spark MLlib estimators expect a single vector column for features. Multiple columns can be converted to a single vector column using [VectorAssembler](https://spark.apache.org/docs/latest/ml-features#vectorassembler)

In [8]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

# Columns that are packed, in this order, into the single "features" vector.
feature_columns = [
    "mother_age",
    "father_age",
    "gestation_weeks",
    "weight_gain_pounds",
    "apgar_5min",
]

assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")

# transform() appends the "features" column next to the original columns.
df_assembler_output = assembler.transform(df_natality_select)
df_assembler_output

weight_pounds,mother_age,father_age,gestation_weeks,weight_gain_pounds,apgar_5min,features
6.3713593718,31,34,37,30,9,"[31.0,34.0,37.0,3..."
6.393405598,36,35,38,99,9,"[36.0,35.0,38.0,9..."
6.21262654316,34,34,37,56,9,"[34.0,34.0,37.0,5..."
9.67608867918,31,49,37,99,8,"[31.0,49.0,37.0,9..."
6.3382900325,43,47,34,18,9,"[43.0,47.0,34.0,1..."
7.3193470984,43,46,44,99,99,"[43.0,46.0,44.0,9..."
7.06140625186,29,30,40,99,99,"[29.0,30.0,40.0,9..."
6.20821729792,35,34,39,99,99,"[35.0,34.0,39.0,9..."
7.8484565272,39,45,40,23,10,"[39.0,45.0,40.0,2..."
5.8312268299,41,43,38,35,9,"[41.0,43.0,38.0,3..."


### Create a training data DataFrame

Create a training data DataFrame with just the features and label column.

Cache the training data table.

In [9]:
# Keep just the feature vector and the target, expose the target under the
# name "label", and mark the result for caching.
df_training_data = (
    df_assembler_output
    .select("features", "weight_pounds")
    .withColumnRenamed("weight_pounds", "label")
    .cache()
)

df_training_data

features,label
"[31.0,34.0,37.0,3...",6.3713593718
"[36.0,35.0,38.0,9...",6.393405598
"[34.0,34.0,37.0,5...",6.21262654316
"[31.0,49.0,37.0,9...",9.67608867918
"[43.0,47.0,34.0,1...",6.3382900325
"[43.0,46.0,44.0,9...",7.3193470984
"[29.0,30.0,40.0,9...",7.06140625186
"[35.0,34.0,39.0,9...",6.20821729792
"[39.0,45.0,40.0,2...",7.8484565272
"[41.0,43.0,38.0,3...",5.8312268299


### Split the data into training and test sets

30% held out for testing

In [10]:
# Fix the seed so the 70/30 split is repeatable across runs on the same input
# rows; without it every execution trains and evaluates on a different
# partition of the data, so the metrics below cannot be reproduced.
SEED = 42

# 70% of the rows go to training, the remaining 30% are held out for testing.
(df_training, df_test) = df_training_data.randomSplit([0.7, 0.3], seed=SEED)

### Construct a new LinearRegression object and fit the training data

Import and use the [LinearRegression model](https://spark.apache.org/docs/latest/ml-classification-regression.html#linear-regression)

In [11]:
from pyspark.ml.regression import LinearRegression

# Estimator configuration. No column names are passed, so the estimator uses
# its defaults -- presumably "features" and "label", matching the training
# DataFrame's columns; confirm against the LinearRegression documentation.
lr = LinearRegression(
    maxIter=5,
    regParam=0.2,
    solver="normal",
)

# Fit on the training split only; the test split is kept for evaluation.
model = lr.fit(df_training)

### Print the coefficients and intercept for linear regression

In [12]:
# sep="" keeps the value directly after the colon, as string concatenation would.
print("Coefficients:", model.coefficients, sep="")
print("Intercept:", model.intercept, sep="")

Coefficients:[0.024976900994897657,-0.00405761639159605,0.27142700913671225,-0.00020640385689556944,0.0008561742907835546]
Intercept:-3.8299366375006008


### Summarize the model over the training data and print metrics

In [13]:
# Summary of the fit computed over the training data.
trainingSummary = model.summary

# Report the iteration count, objective history and fit-quality metrics.
print("numIterations: %d" % (trainingSummary.totalIterations,))
print("objectiveHistory: %s" % (str(trainingSummary.objectiveHistory),))
print("RMSE: %f" % (trainingSummary.rootMeanSquaredError,))
print("r2: %f" % (trainingSummary.r2,))

# Last expression of the cell: displays the residuals DataFrame.
trainingSummary.residuals

numIterations: 1
objectiveHistory: [0.0]
RMSE: 1.205822
r2: 0.391391


residuals
-0.0642712009781352
1.4054942809910722
1.615669149519861
1.8461326439359496
0.0703286005329788
-1.43244194979542
-1.7109598506752508
2.995209709060892
0.2134198724115039
1.267985188300587


### Make predictions on test data

In [15]:
# Score the held-out rows; transform() adds a "prediction" column.
predictions = model.transform(df_test)

# Show the prediction next to the true label and the input features.
comparison_columns = ["prediction", "label", "features"]
predictions.select(*comparison_columns)

prediction,label,features
6.735331281174319,6.25671899556,"[13.0,17.0,38.0,6..."
7.315888778823852,8.437090766739999,"[14.0,16.0,40.0,1..."
7.372168136194236,7.50012615324,"[14.0,17.0,40.0,9..."
6.7400855825755,5.37486994756,"[14.0,20.0,38.0,9..."
5.682308546399798,5.8753192823,"[14.0,99.0,35.0,9..."
4.33033463412318,3.06222081918,"[15.0,18.0,29.0,9..."
7.134663854645244,9.06320359082,"[15.0,18.0,39.0,3..."
6.58610098912506,8.000575487979999,"[15.0,19.0,37.0,4..."
7.307916501843823,6.4374980504,"[15.0,20.0,40.0,9..."
6.53890439878406,5.8753192823,"[15.0,99.0,38.0,1..."


### Select (prediction, true label) and compute test error

In [16]:
from pyspark.ml.evaluation import RegressionEvaluator

# Evaluator comparing the "prediction" column against "label" using RMSE.
evaluator = RegressionEvaluator(
    labelCol="label",
    predictionCol="prediction",
    metricName="rmse",
)

# Compute the metric on the test-set predictions and report it.
rmse = evaluator.evaluate(predictions)
print("Root Mean Squared Error (RMSE) on test data = %g" % (rmse,))

Root Mean Squared Error (RMSE) on test data = 1.18882
