In [None]:
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# 2.1. Google Cloud Storage (CSV) & Spark DataFrames - Python

### Create Dataproc Cluster with Jupyter

This notebook is designed to be run on Google Cloud Dataproc.

Follow the links below for instructions on how to create a Dataproc Cluster with the Juypter component installed.

* [Tutorial - Install and run a Jupyter notebook on a Dataproc cluster](https://cloud.google.com/dataproc/docs/tutorials/jupyter-notebook)
* [Blog post - Apache Spark and Jupyter Notebooks made easy with Dataproc component gateway](https://medium.com/google-cloud/apache-spark-and-jupyter-notebooks-made-easy-with-dataproc-component-gateway-fa91d48d6a5a)

### Python 3 Kernel

Use a Python 3 kernel (not PySpark) to allow you to configure the SparkSession in the notebook.

### Create Spark Session

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
  .appName('2.1. Google Cloud Storage (CSV) & Spark DataFrames') \
  .getOrCreate()

### Enable repl.eagerEval

This will output the results of DataFrames in each step without the need to use `df.show()` and also improves the formatting of the output

In [2]:
spark.conf.set("spark.sql.repl.eagerEval.enabled",True)

### List files in a GCS bucket

List files in a Google Cloud Storage bucket using the [google-cloud-storage python library](https://googleapis.dev/python/storage/latest/client.html) which comes installed on Dataproc clusters. We will be using a publicly available dataset.

In [3]:
from google.cloud import storage

gcs_client = storage.Client()
bucket = gcs_client.bucket('solutions-public-assets')

list(bucket.list_blobs(prefix='time-series-master/'))

[<Blob: solutions-public-assets, time-series-master/, 1423455996970000>,
 <Blob: solutions-public-assets, time-series-master/GBPUSD_2014_01.csv, 1423456343320000>,
 <Blob: solutions-public-assets, time-series-master/GBPUSD_2014_02.csv, 1423456332787000>]

Alternatively use the hdfs cmd to list files in a directory which supports GCS buckets

In [4]:
!hdfs dfs -ls 'gs://solutions-public-assets/time-series-master'

Found 2 items
-rwx------   3 root root   67868938 2015-02-09 04:32 gs://solutions-public-assets/time-series-master/GBPUSD_2014_01.csv
-rwx------   3 root root   61275261 2015-02-09 04:32 gs://solutions-public-assets/time-series-master/GBPUSD_2014_02.csv


### Read CSV files from GCS into Spark Dataframe

Read CSV files from GCS into a dataframe and infer the schema

In [5]:
df1 = spark \
  .read \
  .option ( "inferSchema" , "true" ) \
  .option ( "header" , "true" ) \
  .csv ( "gs://solutions-public-assets/time-series-master/GBPUSD_*.csv" )

df1.printSchema()

root
 |-- XYZ: string (nullable = true)
 |-- GBP/USD: string (nullable = true)
 |-- 2014-01-01 00:00:00.000000: timestamp (nullable = true)
 |-- 1.4995: double (nullable = true)
 |-- 1.5005: double (nullable = true)



In [17]:
df1

XYZ,GBP/USD,2014-01-01 00:00:00.000000,1.4995,1.5005
XYZ,GBP/USD,2014-01-01 00:00:...,1.4988,1.4998
XYZ,GBP/USD,2014-01-01 00:00:...,1.4979,1.4989
XYZ,GBP/USD,2014-01-01 00:00:...,1.4993,1.5003
XYZ,GBP/USD,2014-01-01 00:00:...,1.4989,1.4999
XYZ,GBP/USD,2014-01-01 00:00:...,1.4998,1.5008
XYZ,GBP/USD,2014-01-01 00:00:...,1.5001,1.5011
XYZ,GBP/USD,2014-01-01 00:00:...,1.4991,1.5001
XYZ,GBP/USD,2014-01-01 00:00:...,1.4978,1.4988
XYZ,GBP/USD,2014-01-01 00:00:...,1.4974,1.4984
XYZ,GBP/USD,2014-01-01 00:00:...,1.4987,1.4997


If there is no header with column names as we can see with the dataset here or the schema is not infered correctly then read CSV files from GCS and define schema

In [6]:
from pyspark.sql.types import StructType, StructField
from pyspark.sql.types import DoubleType, IntegerType, StringType, TimestampType, DateType

schema = StructType([
    StructField("venue", StringType()),
    StructField("currencies", StringType()),
    StructField("time_stamp", TimestampType()),
    StructField("bid", DoubleType()),
    StructField("ask", DoubleType())
])

df2 = spark \
  .read \
  .schema(schema) \
  .csv ( "gs://solutions-public-assets/time-series-master/GBPUSD_*.csv" )

df2.printSchema()

root
 |-- venue: string (nullable = true)
 |-- currencies: string (nullable = true)
 |-- time_stamp: timestamp (nullable = true)
 |-- bid: double (nullable = true)
 |-- ask: double (nullable = true)



View the top 20 rows of the spark dataframe

In [7]:
df2

venue,currencies,time_stamp,bid,ask
XYZ,GBP/USD,2014-01-01 00:00:00,1.4995,1.5005
XYZ,GBP/USD,2014-01-01 00:00:...,1.4988,1.4998
XYZ,GBP/USD,2014-01-01 00:00:...,1.4979,1.4989
XYZ,GBP/USD,2014-01-01 00:00:...,1.4993,1.5003
XYZ,GBP/USD,2014-01-01 00:00:...,1.4989,1.4999
XYZ,GBP/USD,2014-01-01 00:00:...,1.4998,1.5008
XYZ,GBP/USD,2014-01-01 00:00:...,1.5001,1.5011
XYZ,GBP/USD,2014-01-01 00:00:...,1.4991,1.5001
XYZ,GBP/USD,2014-01-01 00:00:...,1.4978,1.4988
XYZ,GBP/USD,2014-01-01 00:00:...,1.4974,1.4984


Print the shape of the dataframe. No of rows and no of columns

In [8]:
print((df2.count(), len(df2.columns)))

(2436683, 5)


Add hour column and filter the data to create a new dataframe with only 1 day of data

In [9]:
import pyspark.sql.functions as F

df3 = df2.withColumn("hour", F.hour(F.col("time_stamp"))) \
  .filter(df2['time_stamp'] >= F.lit('2014-01-01 00:00:00')) \
  .filter(df2['time_stamp'] < F.lit('2014-01-02 00:00:10')).cache()

df3

venue,currencies,time_stamp,bid,ask,hour
XYZ,GBP/USD,2014-01-01 00:00:00,1.4995,1.5005,0
XYZ,GBP/USD,2014-01-01 00:00:...,1.4988,1.4998,0
XYZ,GBP/USD,2014-01-01 00:00:...,1.4979,1.4989,0
XYZ,GBP/USD,2014-01-01 00:00:...,1.4993,1.5003,0
XYZ,GBP/USD,2014-01-01 00:00:...,1.4989,1.4999,0
XYZ,GBP/USD,2014-01-01 00:00:...,1.4998,1.5008,0
XYZ,GBP/USD,2014-01-01 00:00:...,1.5001,1.5011,0
XYZ,GBP/USD,2014-01-01 00:00:...,1.4991,1.5001,0
XYZ,GBP/USD,2014-01-01 00:00:...,1.4978,1.4988,0
XYZ,GBP/USD,2014-01-01 00:00:...,1.4974,1.4984,0


In [10]:
print((df3.count(), len(df3.columns)))

(41390, 6)


Group by hour and order by top_bids

In [11]:
import pyspark.sql.functions as F

df4 = df3 \
.groupBy("hour") \
.agg(F.sum('bid').alias('total_bids'))

df4.orderBy('total_bids', ascending=False)

hour,total_bids
12,4888.966399999975
13,4852.239699999989
14,4569.660199999988
15,4518.744800000002
8,2489.1048999999966
10,2431.141400000008
9,2424.1796000000018
18,2368.89479999999
19,2355.5363999999986
11,2347.3602999999907


### Write Spark Dataframe to Google Cloud Storage in CSV format

Write the Spark Dataframe to Google Cloud Storage using 

If the GCS bucket  does not exist it will need to be created before running `df.write`

- [Instructions here for creating a GCS bucket](https://cloud.google.com/storage/docs/creating-buckets)

In [12]:
# Update to your GCS bucket
gcs_bucket = 'dataproc-bucket-name'

gcs_filepath = 'gs://{}/currency/hourly_bids.csv'.format(gcs_bucket)

df4.coalesce(1).write \
  .mode('overwrite') \
  .csv(gcs_filepath)

Read the CSV file into new DataFrame to check it was successfuly saved

In [13]:
!hdfs dfs -ls gs://dataproc-bucket-name/currency

Found 1 items
drwx------   - root root          0 2020-03-27 17:08 gs://dataproc-bucket-name/currency/hourly_bids.csv


In [14]:
df5 = spark.read \
  .option ( "inferSchema" , "true" ) \
  .option ( "header" , "true" ) \
  .csv('gs://dataproc-bucket-name/currency/*')

df5

12,4888.966399999975
22,1512.3183999999997
1,1040.8376999999998
13,4852.239699999989
6,1662.2480999999957
16,2295.8521
3,1032.2795000000003
20,1572.7295999999976
5,1726.0291000000027
19,2355.5363999999986
15,4518.744800000002
