In [3]:
#******************************************************
#*
#* Name:         nb-04-explore-json-files
#*     
#* Design Phase:
#*     Author:   John Miner
#*     Date:     12-04-2024
#*     Purpose:  Shortcut vs dataframe commands.
#*               Managed vs unmanaged tables.
#* 
#******************************************************/


StatementMeta(, b28e61f8-6919-4645-9c1c-b01631ce7bb5, 7, Finished, Available, Finished)

In [2]:
%%sql

 --
 --  1 - Create managed JSON table
 --

 -- del
drop table if exists json_quakes_ex1;

-- add
 create table if not exists json_quakes_ex1 as
    select * from json.`Files/Usgs/earthquakes.json`

StatementMeta(, f0558144-8f9c-47af-89df-b6f9495b3b29, 5, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

<Spark SQL result set with 0 rows and 0 fields>

In [3]:
%%sql

--
-- 2 - get table info
--

describe table extended json_quakes_ex1

StatementMeta(, f0558144-8f9c-47af-89df-b6f9495b3b29, 6, Finished, Available, Finished)

<Spark SQL result set with 13 rows and 3 fields>

In [28]:
%%sql

--
-- 3 - pick fields from structures
--

select 
  id, 
  geometry.coordinates, 
  properties.type,
  properties.mag,
  properties.place,
  from_unixtime(properties.time / 1000) as ts,
  properties.tz / 60 as tz
from json_quakes_ex1 
where id is not null
limit 50


StatementMeta(, 4f2a45c4-e63e-45cb-b39d-bb2283989dae, 30, Finished, Available, Finished)

<Spark SQL result set with 50 rows and 8 fields>

In [48]:
%%sql

--
--  4 - convert JSON to schema string
--

select schema_of_json('{ "type": "FeatureCollection", "metadata": { "generated": 1517968154000, "url": "https://earthquake.usgs.gov/earthquakes/feed/v1.0/summary/all_week.geojson", "title": "USGS All Earthquakes, Past Week", "status": 200, "api": "1.5.8", "count": 1707 }, "features": [ { "type": "Feature", "properties": { "mag": 2, "place": "4km W of Castaic, CA", "time": 1517966773840, "updated": 1517966996303, "tz": -480, "url": "https://earthquake.usgs.gov/earthquakes/eventpage/ci37868143", "detail": "https://earthquake.usgs.gov/earthquakes/feed/v1.0/detail/ci37868143.geojson", "felt": null, "cdi": null, "mmi": null, "alert": null, "status": "automatic", "tsunami": 0, "sig": 62, "net": "ci", "code": "37868143", "ids": ",ci37868143,", "sources": ",ci,", "types": ",geoserve,nearby-cities,origin,phase-data,scitech-link,", "nst": 7, "dmin": 0.04214, "rms": 0.35, "gap": 174, "magType": "ml", "type": "earthquake", "title": "M 2.0 - 4km W of Castaic, CA" }, "geometry": { "type": "Point", "coordinates": [ -118.6671667, 34.4945, 26.49 ] }, "id": "ci37868143" } ], "bbox": [ -179.6445, -65.8617, -2.79, 178.8275, 83.0422, 573.76 ] }')


StatementMeta(, 4f2a45c4-e63e-45cb-b39d-bb2283989dae, 50, Finished, Available, Finished)

<Spark SQL result set with 1 rows and 1 fields>

In [49]:
#
#  5 - convert schema string to object
#

# include types lib
import pyspark.sql.types as t

# paste in text of schema
schema_txt = "STRUCT<bbox: ARRAY<DOUBLE>, features: ARRAY<STRUCT<geometry: STRUCT<coordinates: ARRAY<DOUBLE>, type: STRING>, id: STRING, properties: STRUCT<alert: STRING, cdi: STRING, code: STRING, detail: STRING, dmin: DOUBLE, felt: STRING, gap: BIGINT, ids: STRING, mag: BIGINT, magType: STRING, mmi: STRING, net: STRING, nst: BIGINT, place: STRING, rms: DOUBLE, sig: BIGINT, sources: STRING, status: STRING, time: BIGINT, title: STRING, tsunami: BIGINT, type: STRING, types: STRING, tz: BIGINT, updated: BIGINT, url: STRING>, type: STRING>>, metadata: STRUCT<api: STRING, count: BIGINT, generated: BIGINT, status: BIGINT, title: STRING, url: STRING>, type: STRING>"

# parse to data type string
custom_schema = t._parse_datatype_string(schema_txt)

# show result
print(custom_schema)


StatementMeta(, 4f2a45c4-e63e-45cb-b39d-bb2283989dae, 51, Finished, Available, Finished)

StructType([StructField('bbox', ArrayType(DoubleType(), True), True), StructField('features', ArrayType(StructType([StructField('geometry', StructType([StructField('coordinates', ArrayType(DoubleType(), True), True), StructField('type', StringType(), True)]), True), StructField('id', StringType(), True), StructField('properties', StructType([StructField('alert', StringType(), True), StructField('cdi', StringType(), True), StructField('code', StringType(), True), StructField('detail', StringType(), True), StructField('dmin', DoubleType(), True), StructField('felt', StringType(), True), StructField('gap', LongType(), True), StructField('ids', StringType(), True), StructField('mag', LongType(), True), StructField('magType', StringType(), True), StructField('mmi', StringType(), True), StructField('net', StringType(), True), StructField('nst', LongType(), True), StructField('place', StringType(), True), StructField('rms', DoubleType(), True), StructField('sig', LongType(), True), StructFiel

In [66]:
#
#  6 - spark.read has the most options (4m 43s)
#

# define path
path1 = "Files/Usgs/earthquakes.json"

# read in csv data
df1 = (
  spark.read              
  .schema(custom_schema)
  .option("multiline", "true")
  .json(path2)               
)

# create view
df1.createOrReplaceTempView('tmp_earthquakes')


StatementMeta(, 4f2a45c4-e63e-45cb-b39d-bb2283989dae, 68, Finished, Available, Finished)

In [83]:

%%sql

--
--  7 - test query - must explode features
--

with cte as
(
  select explode(features) as d from tmp_earthquakes
)
select 
  d.id, 
  d.geometry.coordinates, 
  d.properties.type,
  d.properties.mag,
  d.properties.place,
  from_unixtime(d.properties.time / 1000) as ts,
  d.properties.tz / 60 as tz
from cte 


StatementMeta(, 4f2a45c4-e63e-45cb-b39d-bb2283989dae, 85, Finished, Available, Finished)

<Spark SQL result set with 1000 rows and 7 fields>

In [89]:

#
#  8 - unlike example one, this has the converted data
#

# format data before storing
stmt = """
with cte as
(
  select explode(features) as d from tmp_earthquakes
)
select 
  d.id, 
  d.geometry.coordinates, 
  d.properties.type,
  d.properties.mag,
  d.properties.place,
  from_unixtime(d.properties.time / 1000) as ts,
  d.properties.tz / 60 as tz
from cte 
"""
df2 = spark.sql(stmt)

# store data as delta table
df2.write.saveAsTable("json_quakes_ex3")


StatementMeta(, 4f2a45c4-e63e-45cb-b39d-bb2283989dae, 92, Finished, Available, Finished)

In [90]:
%%sql

--
-- 9 - get table info
--

describe table extended json_quakes_ex3

StatementMeta(, 4f2a45c4-e63e-45cb-b39d-bb2283989dae, 93, Finished, Available, Finished)

<Spark SQL result set with 15 rows and 3 fields>

In [91]:
%%sql

 --
 --  10 - Create unmanaged JSON table
 --

-- del
drop table if exists json_quakes_ex2;

-- add
create table if not exists json_quakes_ex2
using json 
location "Files/Usgs/earthquakes.json"


StatementMeta(, 4f2a45c4-e63e-45cb-b39d-bb2283989dae, 95, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

<Spark SQL result set with 0 rows and 0 fields>

In [92]:
%%sql

--
-- 11 - get table info
--

describe table extended json_quakes_ex2

StatementMeta(, 4f2a45c4-e63e-45cb-b39d-bb2283989dae, 96, Finished, Available, Finished)

<Spark SQL result set with 20 rows and 3 fields>

In [93]:
%%sql

--
--  11 - counts should be the same regardless of format
--

select 'managed table using sql', count(*) as rec_cnt from json_quakes_ex1
union
select 'unmanaged table using sql', count(*) as rec_cnt from json_quakes_ex2
union
select 'managed table using spark', count(*) as rec_cnt from json_quakes_ex3



StatementMeta(, 4f2a45c4-e63e-45cb-b39d-bb2283989dae, 97, Finished, Available, Finished)

<Spark SQL result set with 3 rows and 2 fields>